In [19]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as plt
import matplotlib.pyplot as plt
In [20]:
import os
In [21]:
# List the files available in the local datasets folder.
os.listdir(r"C:\Users\kusha\OneDrive\Desktop\Projects\Python\Uber\Datasets")
Out[21]:
['other-American_B01362.csv', 'other-Carmel_B00256.csv', 'other-Dial7_B00887.csv', 'other-Diplo_B01196.csv', 'other-Federal_02216.csv', 'other-FHV-services_jan-aug-2015.csv', 'other-Firstclass_B01536.csv', 'other-Highclass_B01717.csv', 'other-Lyft_B02510.csv', 'other-Prestige_B01338.csv', 'other-Skyline_B00111.csv', 'Uber-Jan-Feb-FOIL.csv', 'uber-raw-data-apr14.csv', 'uber-raw-data-aug14.csv', 'uber-raw-data-janjune-15.csv', 'uber-raw-data-janjune-15_sample.csv', 'uber-raw-data-jul14.csv', 'uber-raw-data-jun14.csv', 'uber-raw-data-may14.csv', 'uber-raw-data-sep14.csv']
In [22]:
# Load the 2015 sample pickup file (uber-raw-data-janjune-15_sample.csv) into a DataFrame.
uber_15 = pd.read_csv(r"C:\Users\kusha\OneDrive\Desktop\Projects\Python\Uber\Datasets\uber-raw-data-janjune-15_sample.csv")
In [23]:
# Count rows that exactly repeat an earlier row (all columns equal).
uber_15.duplicated().sum()
Out[23]:
54
In [24]:
# Drop exact duplicate rows, keeping the first occurrence of each.
# Rebinding the name (instead of inplace=True) keeps the step chainable;
# the original index labels of the surviving rows are preserved.
uber_15 = uber_15.drop_duplicates()
In [25]:
# Count missing values in each column.
uber_15.isnull().sum()
Out[25]:
Dispatching_base_num 0 Pickup_date 0 Affiliated_base_num 1116 locationID 0 dtype: int64
In [26]:
# Inspect the column dtypes as loaded from the CSV (before any type conversion).
uber_15.dtypes
Out[26]:
Dispatching_base_num object Pickup_date object Affiliated_base_num object locationID int64 dtype: object
In [35]:
# Look at the pickup value stored under index label 0.
uber_15.loc[0, 'Pickup_date']
Out[35]:
'2015-05-02 21:43:00'
In [37]:
# Check the Python type of the pickup value stored under index label 0.
type(uber_15.loc[0, 'Pickup_date'])
Out[37]:
str
In [39]:
# Parse the pickup strings into pandas datetimes so the .dt accessor can be used.
uber_15['Pickup_date'] = uber_15['Pickup_date'].pipe(pd.to_datetime)
In [41]:
# Re-check the dtypes to confirm the Pickup_date conversion took effect.
uber_15.dtypes
Out[41]:
Dispatching_base_num object Pickup_date datetime64[ns] Affiliated_base_num object locationID int64 dtype: object
In [43]:
# Display the de-duplicated frame (pandas elides the middle rows of large frames).
uber_15
Out[43]:
| Dispatching_base_num | Pickup_date | Affiliated_base_num | locationID | |
|---|---|---|---|---|
| 0 | B02617 | 2015-05-02 21:43:00 | B02764 | 237 |
| 1 | B02682 | 2015-01-20 19:52:59 | B02682 | 231 |
| 2 | B02617 | 2015-03-19 20:26:00 | B02617 | 161 |
| 3 | B02764 | 2015-04-10 17:38:00 | B02764 | 107 |
| 4 | B02764 | 2015-03-23 07:03:00 | B00111 | 140 |
| ... | ... | ... | ... | ... |
| 99995 | B02764 | 2015-04-13 16:12:00 | B02764 | 234 |
| 99996 | B02764 | 2015-03-06 21:32:00 | B02764 | 24 |
| 99997 | B02598 | 2015-03-19 19:56:00 | B02598 | 17 |
| 99998 | B02682 | 2015-05-02 16:02:00 | B02682 | 68 |
| 99999 | B02764 | 2015-06-24 16:04:00 | B02764 | 125 |
99946 rows × 4 columns
Exploratory Data Analysis
In [46]:
# Derive the month name (e.g. 'May') from each pickup timestamp.
uber_15['Month'] = uber_15['Pickup_date'].dt.month_name()
In [48]:
# Preview the new Month column.
uber_15['Month']
Out[48]:
0 May
1 January
2 March
3 April
4 March
...
99995 April
99996 March
99997 March
99998 May
99999 June
Name: Month, Length: 99946, dtype: object
In [50]:
# Bar chart of the number of pickups per month (bars ordered by count, largest first).
uber_15['Month'].value_counts().plot.bar()
Out[50]:
<Axes: xlabel='Month'>
In [51]:
# Break the pickup timestamp into weekday name, day of month, hour and minute.
# The columns are appended in the order listed here.
uber_15 = uber_15.assign(
    Weekday=lambda frame: frame['Pickup_date'].dt.day_name(),
    Day=lambda frame: frame['Pickup_date'].dt.day,
    Hour=lambda frame: frame['Pickup_date'].dt.hour,
    Minute=lambda frame: frame['Pickup_date'].dt.minute,
)
In [54]:
# Show the first four rows, including the newly derived time columns.
uber_15.iloc[:4]
Out[54]:
| Dispatching_base_num | Pickup_date | Affiliated_base_num | locationID | Month | Weekday | Day | Hour | Minute | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | B02617 | 2015-05-02 21:43:00 | B02764 | 237 | May | Saturday | 2 | 21 | 43 |
| 1 | B02682 | 2015-01-20 19:52:59 | B02682 | 231 | January | Tuesday | 20 | 19 | 52 |
| 2 | B02617 | 2015-03-19 20:26:00 | B02617 | 161 | March | Thursday | 19 | 20 | 26 |
| 3 | B02764 | 2015-04-10 17:38:00 | B02764 | 107 | April | Friday | 10 | 17 | 38 |
In [56]:
# Cross-tabulate pickup counts: one row per month, one column per weekday.
Pivot_Table = pd.crosstab(uber_15['Month'], uber_15['Weekday'])
Pivot_Table
Out[56]:
| Weekday | Friday | Monday | Saturday | Sunday | Thursday | Tuesday | Wednesday |
|---|---|---|---|---|---|---|---|
| Month | |||||||
| April | 2365 | 1833 | 2508 | 2052 | 2823 | 1880 | 2521 |
| February | 2655 | 1970 | 2550 | 2183 | 2396 | 2129 | 2013 |
| January | 2508 | 1353 | 2745 | 1651 | 2378 | 1444 | 1740 |
| June | 2793 | 2848 | 3037 | 2485 | 2767 | 3187 | 2503 |
| March | 2465 | 2115 | 2522 | 2379 | 2093 | 2388 | 2007 |
| May | 3262 | 1865 | 3519 | 2944 | 2627 | 2115 | 2328 |
In [58]:
# Grouped bar chart of the month x weekday counts; the legend is anchored
# just outside the top-right corner of the axes so it does not cover the bars.
Pivot_Table.plot.bar(figsize=(8, 6)).legend(
    loc='upper left',
    bbox_to_anchor=(1, 1),
    fontsize=12,
)
Out[58]:
<matplotlib.legend.Legend at 0x1df3f554530>
In [60]:
# Count pickups for every (weekday, hour) pair as a flat frame with
# columns Weekday, Hour and size.
Pivot_Table2 = (
    uber_15
    .groupby(['Weekday', 'Hour'])
    .size()
    .reset_index(name='size')
)
Pivot_Table2
Out[60]:
| Weekday | Hour | size | |
|---|---|---|---|
| 0 | Friday | 0 | 581 |
| 1 | Friday | 1 | 333 |
| 2 | Friday | 2 | 197 |
| 3 | Friday | 3 | 138 |
| 4 | Friday | 4 | 161 |
| ... | ... | ... | ... |
| 163 | Wednesday | 19 | 1044 |
| 164 | Wednesday | 20 | 897 |
| 165 | Wednesday | 21 | 949 |
| 166 | Wednesday | 22 | 900 |
| 167 | Wednesday | 23 | 669 |
168 rows × 3 columns
In [62]:
# Hourly pickup counts, one line per weekday, with the legend entries in
# calendar order (Sunday first) and placed outside the plotting area.
Weekday_order = [
    'Sunday', 'Monday', 'Tuesday', 'Wednesday',
    'Thursday', 'Friday', 'Saturday',
]
plt.figure(figsize=(8, 6))
sns.pointplot(
    data=Pivot_Table2,
    x="Hour",
    y="size",
    hue="Weekday",
    hue_order=Weekday_order,
).legend(loc='upper left', bbox_to_anchor=(1, 1), fontsize=12)
Out[62]:
<matplotlib.legend.Legend at 0x1df4529e180>
Customer demand peaks on Thursday, Friday, and Saturday evenings, indicating a strong opportunity for targeted promotions or increased staffing during these high-traffic periods.
Pickup volume reaches its highest point around 11:00 PM on Saturdays, highlighting a key window for optimizing late-evening operations and targeted marketing efforts.
In [66]:
uber_15.columns
Out[66]:
Index(['Dispatching_base_num', 'Pickup_date', 'Affiliated_base_num',
'locationID', 'Month', 'Weekday', 'Day', 'Hour', 'Minute'],
dtype='object')
In [68]:
# Load the Jan-Feb FOIL file; it has one row per dispatching base and date
# with that base's active_vehicles and trips counts.
uber_foil = pd.read_csv(r"C:\Users\kusha\OneDrive\Desktop\Projects\Python\Uber\Datasets\Uber-Jan-Feb-FOIL.csv")
uber_foil
Out[68]:
| dispatching_base_number | date | active_vehicles | trips | |
|---|---|---|---|---|
| 0 | B02512 | 1/1/2015 | 190 | 1132 |
| 1 | B02765 | 1/1/2015 | 225 | 1765 |
| 2 | B02764 | 1/1/2015 | 3427 | 29421 |
| 3 | B02682 | 1/1/2015 | 945 | 7679 |
| 4 | B02617 | 1/1/2015 | 1228 | 9537 |
| ... | ... | ... | ... | ... |
| 349 | B02764 | 2/28/2015 | 3952 | 39812 |
| 350 | B02617 | 2/28/2015 | 1372 | 14022 |
| 351 | B02682 | 2/28/2015 | 1386 | 14472 |
| 352 | B02512 | 2/28/2015 | 230 | 1803 |
| 353 | B02765 | 2/28/2015 | 747 | 7753 |
354 rows × 4 columns
In [70]:
# Preview the first three rows of the FOIL data.
uber_foil.head(3)
Out[70]:
| dispatching_base_number | date | active_vehicles | trips | |
|---|---|---|---|---|
| 0 | B02512 | 1/1/2015 | 190 | 1132 |
| 1 | B02765 | 1/1/2015 | 225 | 1765 |
| 2 | B02764 | 1/1/2015 | 3427 | 29421 |
In [ ]:
In [5]:
!pip install chart_studio
!pip install plotly
Collecting chart_studio Downloading chart_studio-1.1.0-py3-none-any.whl.metadata (1.3 kB) Requirement already satisfied: plotly in c:\users\kusha\anaconda3\lib\site-packages (from chart_studio) (5.24.1) Requirement already satisfied: requests in c:\users\kusha\anaconda3\lib\site-packages (from chart_studio) (2.32.3) Collecting retrying>=1.3.3 (from chart_studio) Downloading retrying-1.4.0-py3-none-any.whl.metadata (7.5 kB) Requirement already satisfied: six in c:\users\kusha\anaconda3\lib\site-packages (from chart_studio) (1.16.0) Requirement already satisfied: tenacity>=6.2.0 in c:\users\kusha\anaconda3\lib\site-packages (from plotly->chart_studio) (8.2.3) Requirement already satisfied: packaging in c:\users\kusha\anaconda3\lib\site-packages (from plotly->chart_studio) (24.1) Requirement already satisfied: charset-normalizer<4,>=2 in c:\users\kusha\anaconda3\lib\site-packages (from requests->chart_studio) (3.3.2) Requirement already satisfied: idna<4,>=2.5 in c:\users\kusha\anaconda3\lib\site-packages (from requests->chart_studio) (3.7) Requirement already satisfied: urllib3<3,>=1.21.1 in c:\users\kusha\anaconda3\lib\site-packages (from requests->chart_studio) (2.2.3) Requirement already satisfied: certifi>=2017.4.17 in c:\users\kusha\anaconda3\lib\site-packages (from requests->chart_studio) (2025.1.31) Downloading chart_studio-1.1.0-py3-none-any.whl (64 kB) Downloading retrying-1.4.0-py3-none-any.whl (11 kB) Installing collected packages: retrying, chart_studio Successfully installed chart_studio-1.1.0 retrying-1.4.0 Requirement already satisfied: plotly in c:\users\kusha\anaconda3\lib\site-packages (5.24.1) Requirement already satisfied: tenacity>=6.2.0 in c:\users\kusha\anaconda3\lib\site-packages (from plotly) (8.2.3) Requirement already satisfied: packaging in c:\users\kusha\anaconda3\lib\site-packages (from plotly) (24.1)
In [72]:
import chart_studio.plotly as py
import plotly.graph_objs as go
import plotly.express as px
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
In [74]:
# Enable plotly rendering inside the notebook; connected=True loads plotly.js
# from the online CDN instead of embedding the library in the notebook file.
init_notebook_mode(connected = True)
In [76]:
# List the FOIL column names to pick the axes for the box plot.
uber_foil.columns
Out[76]:
Index(['dispatching_base_number', 'date', 'active_vehicles', 'trips'], dtype='object')
In [82]:
# Box plot of active vehicles, one box per dispatching base.
px.box(uber_foil, x='dispatching_base_number', y='active_vehicles')
In [ ]:
In [98]:
# Collect the monthly 2014 raw-trip files (uber-raw-data-<month>14.csv).
# Selecting by file name rather than slicing the directory listing with [-8:]
# keeps the result correct when other files are added to the folder, and
# sorting makes the order deterministic (os.listdir guarantees no ordering).
Files = sorted(
    name
    for name in os.listdir(r"C:\Users\kusha\OneDrive\Desktop\Projects\Python\Uber\Datasets")
    if name.startswith('uber-raw-data-') and name.endswith('14.csv')
)
Files
Out[98]:
['uber-raw-data-apr14.csv', 'uber-raw-data-aug14.csv', 'uber-raw-data-jul14.csv', 'uber-raw-data-jun14.csv', 'uber-raw-data-may14.csv', 'uber-raw-data-sep14.csv']
In [100]:
# Read every file named in Files and stack them into one frame.
# All files are read first and concatenated once: calling pd.concat inside
# the loop re-copies every previously loaded row on each iteration.
path = r"C:\Users\kusha\OneDrive\Desktop\Projects\Python\Uber\Datasets"
# reversed(): the earlier loop prepended each newly read file, so the last
# entry of Files ended up first; keep that row order for downstream cells.
monthly_frames = [pd.read_csv(os.path.join(path, file)) for file in reversed(Files)]
# pd.concat raises on an empty list, so fall back to an empty frame.
final = pd.concat(monthly_frames) if monthly_frames else pd.DataFrame()
# Drop the list so the per-file frames are not kept alive alongside `final`.
del monthly_frames
In [102]:
# Check the (rows, columns) size of the combined frame.
final.shape
Out[102]:
(4534327, 4)
In [104]:
# Count rows that exactly repeat an earlier row in the combined data.
final.duplicated().sum()
Out[104]:
82581
In [106]:
# Drop exact duplicate rows, keeping the first occurrence of each.
# Rebinding the name (instead of inplace=True) keeps the step chainable;
# the index labels of the surviving rows are preserved.
final = final.drop_duplicates()
In [108]:
# Verify that no duplicate rows remain after the drop.
final.duplicated().sum()
Out[108]:
0
In [110]:
# Peek at the first three rows of the combined frame.
final.iloc[:3]
Out[110]:
| Date/Time | Lat | Lon | Base | |
|---|---|---|---|---|
| 0 | 9/1/2014 0:01:00 | 40.2201 | -74.0021 | B02512 |
| 1 | 9/1/2014 0:01:00 | 40.7500 | -74.0027 | B02512 |
| 2 | 9/1/2014 0:03:00 | 40.7559 | -73.9864 | B02512 |
In [116]:
# Count pickups at each distinct (Lat, Lon) coordinate pair; the result is a
# flat frame with columns Lat, Lon and size.
rush_uber = (
    final
    .groupby(['Lat', 'Lon'])
    .size()
    .reset_index(name='size')
)
rush_uber
Out[116]:
| Lat | Lon | size | |
|---|---|---|---|
| 0 | 39.6569 | -74.2258 | 1 |
| 1 | 39.6686 | -74.1607 | 1 |
| 2 | 39.7214 | -74.2446 | 1 |
| 3 | 39.8416 | -74.1512 | 1 |
| 4 | 39.9055 | -74.0791 | 1 |
| ... | ... | ... | ... |
| 574553 | 41.3730 | -72.9237 | 1 |
| 574554 | 41.3737 | -73.7988 | 1 |
| 574555 | 41.5016 | -72.8987 | 1 |
| 574556 | 41.5276 | -72.7734 | 1 |
| 574557 | 42.1166 | -72.0666 | 1 |
574558 rows × 3 columns
In [120]:
!pip install folium
Collecting folium Downloading folium-0.20.0-py2.py3-none-any.whl.metadata (4.2 kB) Collecting branca>=0.6.0 (from folium) Downloading branca-0.8.1-py3-none-any.whl.metadata (1.5 kB) Requirement already satisfied: jinja2>=2.9 in c:\users\kusha\anaconda3\lib\site-packages (from folium) (3.1.4) Requirement already satisfied: numpy in c:\users\kusha\anaconda3\lib\site-packages (from folium) (1.26.4) Requirement already satisfied: requests in c:\users\kusha\anaconda3\lib\site-packages (from folium) (2.32.3) Requirement already satisfied: xyzservices in c:\users\kusha\anaconda3\lib\site-packages (from folium) (2022.9.0) Requirement already satisfied: MarkupSafe>=2.0 in c:\users\kusha\anaconda3\lib\site-packages (from jinja2>=2.9->folium) (2.1.3) Requirement already satisfied: charset-normalizer<4,>=2 in c:\users\kusha\anaconda3\lib\site-packages (from requests->folium) (3.3.2) Requirement already satisfied: idna<4,>=2.5 in c:\users\kusha\anaconda3\lib\site-packages (from requests->folium) (3.7) Requirement already satisfied: urllib3<3,>=1.21.1 in c:\users\kusha\anaconda3\lib\site-packages (from requests->folium) (2.2.3) Requirement already satisfied: certifi>=2017.4.17 in c:\users\kusha\anaconda3\lib\site-packages (from requests->folium) (2025.1.31) Downloading folium-0.20.0-py2.py3-none-any.whl (113 kB) Downloading branca-0.8.1-py3-none-any.whl (26 kB) Installing collected packages: branca, folium Successfully installed branca-0.8.1 folium-0.20.0
In [122]:
import folium
In [126]:
# Create a base map with folium's defaults (no centre or zoom level is given).
basemap = folium.Map()
In [128]:
from folium.plugins import HeatMap
In [130]:
# Overlay a heat-map layer built from rush_uber onto the base map.
# NOTE(review): rush_uber's columns are Lat, Lon, size, so the third column
# presumably serves as the per-point weight — confirm against folium's HeatMap docs.
HeatMap(rush_uber).add_to(basemap)
Out[130]:
<folium.plugins.heat_map.HeatMap at 0x1df4dd33590>
In [132]:
# Render the map (with the heat-map layer) as the cell output.
basemap
Out[132]:
Make this Notebook Trusted to load map: File -> Trust Notebook